import os
import re

import requests
from lxml import etree

from settings import *
from status import *


def get_tid(url):  # Extract the thread ID from a URL, or pass a bare ID through
    # A bare thread ID (no scheme) is stored as-is; otherwise pull it
    # out of the `tid=` query parameter.
    if 'https://' in url or 'http://' in url:
        Status.tid = re.search(r'tid=(\d+)', url).group(1)
    else:
        Status.tid = url
    return Status.tid


def get_authorid(url):  # Extract the author ID from the `authorid=` query parameter
    match = re.search(r'authorid\=(\d+)', url)
    Status.aid = match.group(1)
    return Status.aid


def get_page(url):  # Extract the page number from the `page=` query parameter
    match = re.search(r'page\=(\d+)', url)
    Status.page = match.group(1)
    return Status.page


def generate_url(url, n):  # Build the address for page n
    # Full URLs get their `page=` parameter rewritten; bare thread IDs
    # get a `-n` suffix instead.
    Status.page = n
    if 'https://' in url or 'http://' in url:
        return re.sub(r'page\=(\d+)', f'page={n}', url)
    return url + f'-{n}'


def get_html(url, retries=3):
    """Fetch *url* with the configured cookie, retrying on network errors.

    Up to *retries* additional attempts are made after the first failure;
    once they are exhausted the last exception propagates to the caller.
    """
    while True:
        retries -= 1
        try:
            return requests.get(url, headers={'cookie': settings['cookie']})
        except requests.RequestException:
            # Retry only network/HTTP failures — the original bare
            # `except:` also swallowed KeyboardInterrupt and SystemExit.
            if retries < 0:
                raise
            print(f'获取 {url} 失败，正在重试 ×\n')


def get_web(url):  # Fetch a forum page and return the parsed lxml tree
    """Download *url*, optionally cache its raw HTML under ``html/<tid>/``,
    and return the document parsed with ``etree.HTML``.
    """
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(f'html/{Status.tid}', exist_ok=True)
    log(f'正在获取第 {Status.page} 页，链接为：{url}')
    text = get_html(url).text
    # Log the success mark only after the request actually completed;
    # previously it was printed before the fetch even started.
    log(f' √\n')
    if settings['保存源码']:
        log(f'正在保存源码至：html/{Status.tid}/{Status.tid}-{Status.page}.html')
        with open(f'html/{Status.tid}/{Status.tid}-{Status.page}.html', 'w', encoding='utf-8_sig') as f:
            f.write(text)
        log(f' √\n')
    return etree.HTML(text)


def get_img(url):  # Download an image, emitting a '#' progress marker
    log(f'#')
    return get_html(url).content


def get_file(fn):  # Read a cached page from disk and parse it
    log(f'正在读取文件：html/{Status.tid}/{fn}.html')
    with open(f'html/{Status.tid}/{fn}.html', 'r', encoding='utf-8_sig') as fp:
        page_source = fp.read()
    log(f' √\n')
    return etree.HTML(page_source)


def get(url):  # Dispatch: fetch from the web for URLs, from disk otherwise
    # Note: any string containing 'https' also contains 'http', so a single
    # substring test is exactly equivalent to the original double check.
    if 'http' in url:
        return get_web(url)
    return get_file(url)
